"""
Code plots the leadin figures. Code is written in python 3.

by: Jacob Malone (j.malone@cablelabs.com)
date: August 25, 2021
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.width = None

# Define working directory and load files from the "03_id_cord_cutters.py" files
data_dir = '/mnt/et01/cord_cutting_rep'

cord_cutters = pd.read_parquet(f'{data_dir}/04_cord_cutters.h5')
video_dates = pd.read_parquet(f'{data_dir}/04_video_dates.h5')

#####

cord_cutters = cord_cutters.merge(video_dates, on='customer_key', how='left')
cord_cutters['days_to_cut'] = cord_cutters.date - cord_cutters.min_dt_wo_video
cord_cutters['week_num'] = cord_cutters.date.dt.week
cord_cutters['week_of_cut'] = cord_cutters.min_dt_wo_video.dt.week
cord_cutters['weeks_to_cut'] = cord_cutters.week_num - cord_cutters.week_of_cut
cord_cutters['days_w_video'] = cord_cutters.max_dt_w_video - cord_cutters.min_dt_w_video
cord_cutters['days_wo_video'] = cord_cutters.max_dt_wo_video - cord_cutters.min_dt_wo_video

# Restrict sample to only include cord-cutters with 7 days on either side
cord_cutters = cord_cutters[((cord_cutters.days_w_video >= pd.Timedelta(days=7)) &
                             (cord_cutters.days_wo_video >= pd.Timedelta(days=7)))].copy()
print(cord_cutters.customer_key.nunique())

#####

video_cols = sorted([_ for _ in cord_cutters.columns if _ != 'level_video' and _.startswith('level_vid')])

# Determine the first week an OTT video service was used
first_wk_cols = []
for vcol in video_cols:
    vcol_usage = cord_cutters.groupby(['customer_key', 'weeks_to_cut'])[vcol].sum().reset_index()  # Get weekly totals
    
    # Identify users and their first week of usage
    vcol_pos_usage = vcol_usage[(vcol_usage[vcol] > 0)].copy()
    vcol_first_wk = vcol_pos_usage.groupby('customer_key').weeks_to_cut.min().reset_index()
    vcol_first_wk.rename(columns={'weeks_to_cut': 'firstwk_{0}'.format(vcol)}, inplace=True)
    
    # Merge results back into DataFrame
    cord_cutters = pd.merge(cord_cutters, vcol_first_wk, on='customer_key', how='left')
    cord_cutters['firstwk_{0}'.format(vcol)] = cord_cutters['firstwk_{0}'.format(vcol)].fillna(999999)
    
    # Store variable for later usage
    first_wk_cols.append('firstwk_{0}'.format(vcol))

# Filter down to cord-cutters that have at least 10 weeks before the cut and focus on the last 8 weeks before (for figure)
cord_cutters = cord_cutters[((cord_cutters.weeks_to_cut >= -10)&(cord_cutters.weeks_to_cut <= 0))].copy()
cord_cutters['weeks_obs'] = cord_cutters.groupby('customer_key').weeks_to_cut.transform('nunique')
df_cordcuts = cord_cutters[(cord_cutters.weeks_obs == 11)].copy()
df_cordcuts = df_cordcuts[((df_cordcuts.weeks_to_cut >= -8)&(df_cordcuts.weeks_to_cut <= 0))].copy()

# Collapse data to the week level of observation
collapse_dict = {}
for col in video_cols:
    collapse_dict[col] = 'mean'
for col in first_wk_cols:
    collapse_dict[col] = 'first'
cord_cutters = cord_cutters.groupby(['customer_key', 'weeks_to_cut']).agg(collapse_dict).reset_index()

#####

# Loop over the OTTV columns and weeks to generate statistics.
excel_writer = pd.ExcelWriter('cord_cut_lead_in_stats.xlsx')
vcol_stats = {}
for vcol in video_cols:
    vcol_stats[vcol] = {}
    for rweek in range(-8, 1):
        week_dat = cord_cutters[(cord_cutters.weeks_to_cut == rweek)].copy()  # Filter to that week

        num_total_users = week_dat.customer_key.nunique()

        # Indicator for positive usage of the OTTV application
        week_dat['pos_usage'] = 0
        week_dat.loc[week_dat[vcol] > 0, 'pos_usage'] = 1

        # Get subscriber types for when they became active
        week_dat['existing_user'] = 0
        week_dat.loc[(week_dat['firstwk_{0}'.format(vcol)] < rweek)&(week_dat.pos_usage == 1), 'existing_user'] = 1
        num_existing_users = week_dat.existing_user.sum()

        week_dat['new_user'] = 0
        week_dat.loc[(week_dat['firstwk_{0}'.format(vcol)] == rweek)&(week_dat.pos_usage == 1), 'new_user'] = 1
        num_new_users = week_dat.new_user.sum()

        week_dat['longterm_user'] = 0
        week_dat.loc[(week_dat['firstwk_{0}'.format(vcol)] <= -10)&(week_dat.pos_usage == 1), 'longterm_user'] = 1
        num_longterm_users = week_dat.longterm_user.sum()

        # Record usage
        existing_daily_avg = week_dat[week_dat.existing_user == 1][vcol].mean()
        new_daily_avg = week_dat[week_dat.new_user == 1][vcol].mean()
        longterm_daily_avg = week_dat[week_dat.longterm_user == 1][vcol].mean()

        # Store results in a dictionary
        vcol_stats[vcol][rweek] = {}
        vcol_stats[vcol][rweek]['existing_daily_avg'] = existing_daily_avg
        vcol_stats[vcol][rweek]['longterm_daily_avg'] = longterm_daily_avg
        vcol_stats[vcol][rweek]['new_daily_avg'] = new_daily_avg
        vcol_stats[vcol][rweek]['num_existing_users'] = num_existing_users
        vcol_stats[vcol][rweek]['num_longterm_users'] = num_longterm_users
        vcol_stats[vcol][rweek]['num_new_users'] = num_new_users
        vcol_stats[vcol][rweek]['num_total_users'] = num_total_users

    vcol_frame = pd.DataFrame(vcol_stats[vcol]).transpose()
    vcol_frame.columns = ['Existing Users Daily Avg',
                          'Longterm Users Daily Avg',
                          'New Users Daily Avg',
                          'Number of Active Existing Users',
                          'Number of Active Longterm Users',
                          'Number of Active New Users',
                          'Total Number of Cord Cutters']
    vcol_frame.reset_index(drop=False, inplace=True)
    vcol_frame.rename(columns={'index':'Weeks Relative to Cord Cut'}, inplace=True)
    vcol_frame.to_excel(excel_writer, vcol, index=False)
excel_writer.save()

#####

# FIGURE A4

def plot_leadin_mins(dates, leadin_avgs, filename):

    f, ax = plt.subplots()
    ax.xaxis.grid(color="gray", linewidth=0.5)
    ax.yaxis.grid(color="gray", linewidth=0.5)
    ax.set_axisbelow(True)

    plot0 = ax.plot(dates,
                    leadin_avgs.loc[:, 'Hulu'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#009933",
                    marker="^",
                    markersize=10,
                    markerfacecolor="#009933")

    plot1 = ax.plot(dates,
                    leadin_avgs.loc[:, 'Netflix'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#330099",
                    marker="o",
                    markersize=10,
                    markerfacecolor="#330099")
    
    plot2 = ax.plot(dates,
                    leadin_avgs.loc[:, 'Sling TV'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#993300",
                    marker="s",
                    markersize=10,
                    markerfacecolor="#993300")

    plot3 = ax.plot(dates,
                    leadin_avgs.loc[:, 'YouTube'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#990066",
                    marker="D",
                    markersize=10,
                    markerfacecolor="#990066")

    ax.set_xlim([-8.5, 0.5])
    ax.set_xticks(range(-8, 1, 2))
    ax.set_xlabel('Weeks Relative to Cord-Cut Date', size=20)
    ax.set_ylabel('Estimated Minutes per Day', size=20)

    ax.tick_params(axis='x',
                   bottom='on',
                   top='off',
                   which='both',
                   labelsize=16)
    ax.tick_params(axis='y',
                   left='off',
                   right='off',
                   which='both',
                   labelsize=16)
                   
    ax.annotate(r'\textbf{Hulu}',
                xy=(-8.1, 55),
                xycoords='data',
                horizontalalignment='left',
                verticalalignment='center',
                fontsize=14)
    ax.annotate(r'\textbf{Netflix}',
                xy=(-8.1, 108),
                xycoords='data',
                horizontalalignment='left',
                verticalalignment='center',
                fontsize=14)
    ax.annotate(r'\textbf{Sling TV}',
                xy=(-8.1, 70),
                xycoords='data',
                horizontalalignment='left',
                verticalalignment='center',
                fontsize=14)
    ax.annotate(r'\textbf{YouTube}',
                xy=(-8.1, 40),
                xycoords='data',
                horizontalalignment='left',
                verticalalignment='center',
                fontsize=14)

    f.savefig('./figs/{0}.eps'.format(filename), bbox_inches='tight')
    f.savefig('./figs/{0}.png'.format(filename), bbox_inches='tight')
    
    plt.close(f)


hulu_dat = []
netflix_dat = []
sling_dat = []
youtube_dat = []
for week_num in range(-8, 1):
    print(week_num)
    hulu_dat.append(vcol_stats['level_vidhulu'][week_num]['longterm_daily_avg'])
    netflix_dat.append(vcol_stats['level_vidnetflix'][week_num]['longterm_daily_avg'])
    sling_dat.append(vcol_stats['level_vidslingtv'][week_num]['longterm_daily_avg'])
    youtube_dat.append(vcol_stats['level_vidyoutube'][week_num]['longterm_daily_avg'])
leadin_dat = pd.DataFrame([hulu_dat, netflix_dat, sling_dat, youtube_dat]).transpose()
leadin_dat.columns = ['Hulu', 'Netflix', 'Sling TV', 'YouTube']

# Convert to minutes per day
leadin_dat['Hulu'] = (leadin_dat["Hulu"] / 0.97)*60
leadin_dat['Netflix'] = (leadin_dat["Netflix"] / 1.36)*60
leadin_dat['Sling TV'] = (leadin_dat["Sling TV"] / 1.41)*60
leadin_dat['YouTube'] = (leadin_dat["YouTube"] / 0.88)*60

plot_leadin_mins(range(-8, 1), leadin_dat, '05_leadin_minutes')

#####

# FIGURE A7

def plot_leadin_users_line(dates, leadin_avgs, filename):

    f, ax = plt.subplots()
    ax.xaxis.grid(color="gray", linewidth=0.5)
    ax.yaxis.grid(color="gray", linewidth=0.5)
    ax.set_axisbelow(True)

    plot0 = ax.plot(dates,
                    leadin_avgs.loc[:, 'Hulu'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#009933",
                    marker="^",
                    markersize=10,
                    markerfacecolor="#009933")

    plot1 = ax.plot(dates,
                    leadin_avgs.loc[:, 'Netflix'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#330099",
                    marker="o",
                    markersize=10,
                    markerfacecolor="#330099")
    
    plot2 = ax.plot(dates,
                    leadin_avgs.loc[:, 'Sling TV'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#993300",
                    marker="s",
                    markersize=10,
                    markerfacecolor="#993300")

    plot3 = ax.plot(dates,
                    leadin_avgs.loc[:, 'YouTube'],
                    linestyle="dashed",
                    linewidth=2,
                    color="#990066",
                    marker="D",
                    markersize=10,
                    markerfacecolor="#990066")

    ax.set_xlim([-8.5, 0.5])
    ax.set_xticks(range(-8, 1, 2))
    ax.set_xlabel(r'Weeks Relative to Cord-Cut Date', size=20)
    ax.set_ylabel(r'Marginal Active Subscribers', size=20)

    ax.tick_params(axis='x',
                   bottom='on',
                   top='off',
                   which='both',
                   labelsize=16)
    ax.tick_params(axis='y',
                   left='off',
                   right='off',
                   which='both',
                   labelsize=16)
                   
    ax.annotate(r'\shortstack{\textbf{Hulu}\\+16\%}',
                xy=(0, 100),
                xycoords='data',
                horizontalalignment='right',
                verticalalignment='center',
                fontsize=14)
    ax.annotate(r'\shortstack{\textbf{Netflix}\\+9\%}',
                xy=(0, 525),
                xycoords='data',
                horizontalalignment='right',
                verticalalignment='center',
                fontsize=14)
    ax.annotate(r'\shortstack{\textbf{Sling TV}\\+282\%}',
                xy=(-0.1, 325),
                xycoords='data',
                horizontalalignment='right',
                verticalalignment='center',
                fontsize=14)
    ax.annotate(r'\shortstack{\textbf{YouTube}\\-9\%}',
                xy=(-0.75, -525),
                xycoords='data',
                horizontalalignment='right',
                verticalalignment='center',
                fontsize=14)

    f.savefig('./figs/{0}.eps'.format(filename), bbox_inches='tight')
    f.savefig('./figs/{0}.png'.format(filename), bbox_inches='tight')
    
    plt.close(f)


hulu_dat = []
netflix_dat = []
sling_dat = []
youtube_dat = []

hulu_first_value = 0
netflix_first_value = 0
sling_first_value = 0
youtube_first_value = 0
for week_num in range(-8, 1):
    print(week_num)
    hulu_array = (np.array(vcol_stats['level_vidhulu'][week_num]['num_existing_users']) + 
                  np.array(vcol_stats['level_vidhulu'][week_num]['num_new_users']))
    if week_num == -8:
        hulu_first_value = hulu_array
    hulu_dat.append(hulu_array - hulu_first_value)
    
    netflix_array = (np.array(vcol_stats['level_vidnetflix'][week_num]['num_existing_users']) + 
                     np.array(vcol_stats['level_vidnetflix'][week_num]['num_new_users']))
    if week_num == -8:
        netflix_first_value = netflix_array
    netflix_dat.append(netflix_array - netflix_first_value)
    
    sling_array = (np.array(vcol_stats['level_vidslingtv'][week_num]['num_existing_users']) + 
                   np.array(vcol_stats['level_vidslingtv'][week_num]['num_new_users']))
    if week_num == -8:
        sling_first_value = sling_array
    sling_dat.append(sling_array - sling_first_value)
    
    youtube_array = (np.array(vcol_stats['level_vidyoutube'][week_num]['num_existing_users']) + 
                     np.array(vcol_stats['level_vidyoutube'][week_num]['num_new_users']))
    if week_num == -8:
        youtube_first_value = youtube_array
    youtube_dat.append(youtube_array - youtube_first_value)
leadin_dat = pd.DataFrame([hulu_dat, netflix_dat, sling_dat, youtube_dat]).transpose()
leadin_dat.columns = ['Hulu', 'Netflix', 'Sling TV', 'YouTube']

plot_leadin_users_line(range(-8, 1), leadin_dat, '06_leadin_users_line')
